# -*- coding: utf-8 -*-
"""
1) Takes downsampled points, calculate fuel and emissions and save to tracks dataset
2) Merges and cleans the speed bin data set
3) Merges and cleans points data (for bins, grid, NA ECA speed profile)
4) Joins Entrance/Clearance Data

Created on Fri Aug 16 09:25:42 2019

@author: rklotz
"""

import pandas as pd
import numpy as np
import os

import final_track_functions

import merge_EC_full

###############################################################################
years =  [ 2009 , 2010 ,  2011, 2012, 2013, 2014, 2015, 2016]

# Input files
# v2 - includes interpolated in addition to fragments
# v2a - includes NA ECA and interp flag
# downsampled point CSVs
data_folder = r"H:\My Drive\Boats\ReplicationCode\data"
input_version = r""

output_folder = r"H:\My Drive\Boats\ReplicationCode\final_data"
output_version = "" 

ds_pnts_csv = os.path.join(data_folder,'AIS','voyages_v2a',"points_ds"+input_version+"_%s.csv")

# speed bins CSVs
speed_bins_csv = os.path.join(data_folder,'AIS','voyages_v2a',"speed_bins"+input_version+"_%s.csv")

# track files (using these to get OID - IMO map; and for some route statistics)
tracks_csv = os.path.join(data_folder,'AIS','voyages_v2a',"voyages"+input_version+"_%s.csv")

# entrance/exits with different study area boundaries
entexit_nobound_csv = os.path.join(data_folder,'AIS','voyages_v2a',"voyages_entexit_nobound"+input_version+".csv")
    
# vessel chars
vessel_char_csv =  os.path.join(data_folder+"_not_to_share","wfr_wHist_09182020.csv") # includes owner/operator vars

# mmsi imo map (results of matching in stata)
mmsi_imo_dta = os.path.join(data_folder,'AIS','voyages_v2a',"mmsi_imo_my.dta")

# port names xlsx file
ports_decs = os.path.join(data_folder,"port_decs_wAK_HI.xls")

# mid to country ISO code map
mid2cntry_file = os.path.join(data_folder,"mid_iso_map.csv")

# entrance clearance files
# more aggregate region definitions
agg_region = os.path.join(data_folder,"ECport_agg.xls")

# EC data after being cleaned in stata
EC_dta = os.path.join(data_folder,'entrance_clearance','ec0716_clean.dta')

# output files - for tracks #################################
out_csv = os.path.join(output_folder,"CA_tracks%s.csv" % output_version )
#out_dta = os.path.join(output_folder,"CA_tracks_wAP2_04082020_v2a.dta" )
out_other_csv = os.path.join(output_folder,"CA_tracks_other%s.csv" % output_version)

# saving speed bin file
out_bins_csv = os.path.join(output_folder,"CA_speed_bins%s.csv" % output_version )
#out_bins_dta = os.path.join(output_folder,"CA_speed_bins_04082020_v2a.dta")

# saving csv with points and vessel chars
# used for vessel speed profiles etc
out_csv_points = os.path.join(output_folder,"all_points%s" % output_version + "_%s.csv"  )
out_dta_points = os.path.join(output_folder,"all_points%s"  % output_version + "_%s.dta"  )

# saving csv with points and vessel chars (for NA ECA analysis)
out_csv_pnts_naeca = os.path.join(output_folder,"naeca_points%s"  % output_version + "_%s.csv"  )
out_dta_pnts_naeca = os.path.join(output_folder,"naeca_points%s"  % output_version + "_%s.dta" )

out_csv_points_grid = os.path.join(output_folder, ( "all_points_grid%s"  % output_version ) + "_%s.csv" )


###### ASSUMPTIONS 

# Update AP2 damage estimates to VSL consistent with ISRM
# ISRM VSL is 8.3 million in 2011 dollars
# AP2 VSL in 5.9 million in 2000 dollars
ap2_scale = 8300000/5907840 

### Emissions Factors (tons pollutant per ton fuel)
# from IMO 2014 
# for main engines, we calculate tons of fuel, so need emissions factors per ton fuel

# sfoc - main engine (slow speed)
# 195 - residual
# 185 - distillate
dist_F_scale = 185/195 # scalar to reduce fuel consumption by when using distillate
                       # not using at the moment

PM_scale = 0.133

PM_res = 0.00728            # ton per ton fuel (Table 22 IMO 2014)
PM_dist = PM_res * PM_scale     # scaling down based on scaling factors in IMO 2014 - linear interpolation
SO2_res = 0.027 * .9754 * 2     # based on sulfur content of fuel (2.7%) 
SO2_dist = 0.001 * .9754 * 2    # based on sulfur content of fuel (0.1%)
# sulfur content * fraction of sulfur as sox * 2 (to get weight in terms of sox) - all sox is SO2 for vessels

df_EF = pd.DataFrame( [ [PM_res , SO2_res] , [PM_dist, SO2_dist] ] , columns = ['PM','SO2'] , index=['res','dist'])


# for auxiliary engines
# we calculate kWh, so want emissions factors per kw 
# but need sfoc to calculate fuel consumption

# sfoc - auxiliary engine (medium or high)
# 227 - residual
# 217 - distillate
sfoc_aux = 227   * 1e-6              # g fuel/kwh for medium speed aux engine (Table 22) --- CONVERTED TO t fuel/kwh

PM_res_aux = 0.00634
PM_dist_aux = PM_res_aux*PM_scale

df_EF_aux = pd.DataFrame( [ [PM_res_aux , SO2_res] , [PM_dist_aux, SO2_dist] ] , columns = ['PM','SO2'] , index=['res','dist'])

#### NOx Emissions Factors t pollution per t fuel
# not changing for distillate, but do depend on tier of vessel (which we model by age)
nox_data =   [ [0,  0.093 , 0.065], [1,  0.09, 0.05727]  , [2,  0.07846, 0.04934] ] 
df_EF_NOx = pd.DataFrame( nox_data , columns = ['Tier','EF','EFaux'] )
del nox_data

#### VOC Emissions factor (t pollution per t fuel)
EF_VOC = 0.00308 # IMO 2015 - Table 32 

###############################################################################

def main() :
    
    # creating output directory if needed
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
            
    # get track to IMO mapping
    print('Getting track to IMO map')
    df_oid_imo , df_oid_mmsi_date = final_track_functions.get_imo_by_track(tracks_csv,mmsi_imo_dta,years) 
    # need to match on oid year here
    
    # load ports file
    ports_df = pd.read_excel(ports_decs)
    ports_df = ports_df[ ['port_id','port_name','portcode_SN','port_agg'] ]
    #print ports_df
    
    # load mid to cntry file 
    mid2cntry_df = pd.read_csv(mid2cntry_file)
    mid2cntry_df = mid2cntry_df[ ['mid','flag_cntry','flag_country'] ]
    
    # load vessel chars and add some required variables
    df_ves_char = pd.read_csv( vessel_char_csv , index_col=False ,  low_memory=False) 
    df_ves_char = df_ves_char.replace(r'\r',' ', regex=True)
    df_ves_char = df_ves_char.replace(r'\n',' ', regex=True)
        
    # adding IMO Tier based on year built
    # for NOx calculations
    df_ves_char['Tier'] = 0 
    df_ves_char.loc[ df_ves_char['built'] >=2000 , 'Tier'] = 1
    df_ves_char.loc[ df_ves_char['built'] >=2011 , 'Tier'] = 2
    
    df_ves_char = df_ves_char.merge(df_EF_NOx, on='Tier' , how='left') # merge in efs
    
    # dropping a bunch of characteristics we don't use
    df_ves_char.drop(columns=['Ballast','Bunker','Fuel','Hull No','Main Engine Detail','Main Engine Model',\
                              'Main Engine Series' , 'Main Engine Summary' , 'Maximum speed','OFC Vessel Types',\
                              'Passengers', 'Propeller' , 'SFC Vessel Types', 'builder' , 'cap_ifo','cap_mdo','cap_mgo',\
                              'crude_capacity_bbl','engine_dtl' , 'engine_make' , 'engine_model' , 'flag' ,\
                              'lng_capacity_m3' , 'vehicles' , 'total_cap' , 'type' , 'vehicles' , 'beam_interp',\
                             'loa_interp', 'draft_interp', 'built_interp',  'power_kw_interp', 'speed_kts_interp',\
                            'dwt_interp', 'cons_td_interp','aux_sea_interp', 'built_orig', 'cons_td_orig',  \
                            'draft_orig', 'dwt_orig', 'group_merge', 'loa_orig', \
                            'power_kw_orig', 'register', 'speed_kts_orig', 'aux_sea_orig' ] , inplace=True )    
    
      
    # load mmsi imo map
    #df_mmsi_imo = pd.read_stata( mmsi_imo_dta )
    #df_mmsi_imo['m'] = df_mmsi_imo['my'].dt.month
    #df_mmsi_imo['y'] = df_mmsi_imo['my'].dt.year
    #df_mmsi_imo = df_mmsi_imo.rename(columns={'mmsi':'MMSI'})
    
    # getting just vessel chars for fuel calcs
    # and for speed profiles
    ves_fuel_chars = ['group_agg','alpha_power','alpha_cons','Power Type','aux_sea','EF','EFaux'] # 'aux_sea_lin'
    df_ves_char_fuel = df_ves_char[['IMO']+ves_fuel_chars]
    
    print('Getting and cleaning nobound entrance/exits')
    df_nobound = final_track_functions.get_nobound( entexit_nobound_csv , df_oid_mmsi_date, df_ves_char_fuel , sfoc_aux, EF_VOC ) 
        
    # loop through years
    df_list = []
    #df_km_bins_list = [] ;
    df_pnts_list = [] 
    for year in years :
        print('Loading and Cleaning {}'.format(year))
        # load down sampled track points    
        df_pnts =  pd.read_csv( ds_pnts_csv % year , index_col=False )
        
        df_pnts['AISyear'] = year
        df_pnts = df_pnts.replace(-9999,np.nan) # replace missing values as nan
        df_pnts['IMO'] = df_pnts.IMO.replace(0,np.nan) # replace IMO
           
        # renaming D_SO2_SO4 column
        df_pnts = df_pnts.rename(columns={'D_SO2_SO4':'D_SO2'})
        
        # remove rows with zero distance or time, which will have nans and 
        df_pnts.drop(df_pnts[df_pnts['s3_dt']==0].index, inplace = True)
        
        # getting list of variables
        #pnts_vars = list( df_pnts.columns.values ) 
        
        df_pnts['m'] = pd.to_datetime( df_pnts['TIME1']).dt.month
        df_pnts['y'] = pd.to_datetime( df_pnts['TIME1']).dt.year
                
        # replace points without damage estimates
        # if missing use lowest observed marginal damage on that track
        # if still missing assing to lowest observed 
        pol_vars = ['D_PM','D_SO2','D_PM_inm','D_SO2_inm','D_NOx_inm','D_VOC_inm','D_NH3_inm']
        for v in pol_vars :
            df_pnts[v] = df_pnts[v].fillna(df_pnts.groupby('OID')[v].transform('min'))
            df_pnts[v] = df_pnts[v].fillna(df_pnts[v].min())
       
        # Scaling AP2 values to adjust VSL
        ap2_vars = ['D_PM','D_SO2']
        for v in ap2_vars :
            df_pnts[v] = df_pnts[v]*ap2_scale
                      
        # storing original IMO
        df_pnts['imo_orig'] = df_pnts.IMO   
        
        if year in [2010,2011,2012,2013,2014]:
            # merge in im mmsi map
            df_pnts = df_pnts.merge(df_oid_imo,on=['OID','AISyear'],how='left')
            df_pnts['IMO'] = df_pnts.imo_m 
            #df_pnts = df_pnts.drop(columns='imo_m')
            #df_pnts = df_pnts.rename( columns={'imo_m':'IMO'} )
            
        #else :
               
        # flag point if near a crude terminal (works better than track creation algorithm)
        df_pnts['elsegundo'] = np.where(df_pnts.GRID_ID==12466,1,0) 
        df_pnts['rosarito'] = np.where(df_pnts.GRID_ID==17780,1,0)
        df_pnts['ensenada'] = np.where(df_pnts.GRID_ID==16138,1,0)
                
        # merge chars
        df_pnts = df_pnts.merge(df_ves_char_fuel, on='IMO' , how='left')
        
            
        # determine whether track crosses na eca (merging in later)
        df_naeca_shr = df_pnts.groupby(by='OID',as_index=False)['in_naeca'].mean()
        df_naeca_shr['naeca_cross'] = np.where( (df_naeca_shr.in_naeca>0) & (df_naeca_shr.in_naeca<1) , 1, 0 )
        df_pnts = df_pnts.merge(df_naeca_shr[['OID','naeca_cross']],on=['OID'],how='left')
        del df_naeca_shr
        
        # calulate fuel consumption at each point   
        print('Power, Fuel and Emissions Calculations')
    
        # power used outside eca    
        df_pnts['s3_dt_out09'] = df_pnts.s3_dt - df_pnts.s3_dt_eca09
        df_pnts['s3_dt_out11'] = df_pnts.s3_dt - df_pnts.s3_dt_eca11
        
        # calculating fraction of power in ECA
        df_pnts['s3_dt_shr_eca09'] = ( df_pnts.s3_dt_eca09/df_pnts.s3_dt ).replace(np.nan,0)
        df_pnts['s3_dt_shr_eca11'] = ( df_pnts.s3_dt_eca11/df_pnts.s3_dt ).replace(np.nan,0) 
        
        # calculating time in and out of ECA
        df_pnts['d_time_hr_eca09'] = df_pnts['d_time_hr'] * df_pnts['s3_dt_shr_eca09'] 
        df_pnts['d_time_hr_eca11'] = df_pnts['d_time_hr'] * df_pnts['s3_dt_shr_eca11']
        df_pnts['d_time_hr_out09'] = df_pnts['d_time_hr'] * ( 1 - df_pnts['s3_dt_shr_eca09'] )
        df_pnts['d_time_hr_out11'] = df_pnts['d_time_hr'] * ( 1 - df_pnts['s3_dt_shr_eca11'] )
            
        # calculating power, hours in 2011 and 2009 ECA, and 2011 ECA but not 2009 ECA
        for v in ['s3_dt','d_time_hr'] :
            df_pnts[ v + '_eca11and09' ] = df_pnts[[ v+'_eca09', v+'_eca11' ]].min(axis=1)
            df_pnts[ v + '_eca11not09' ] = df_pnts[v+'_eca11'] - df_pnts[v+'_eca11and09']
            
        # calculate fuel
        tags = ['_power','_cons']
        eca_tags = ["","_out09","_out11","_eca09","_eca11","_eca11and09","_eca11not09"]
        for eca_tag in eca_tags :
            df_pnts['Faux' +eca_tag]  = sfoc_aux * df_pnts['aux_sea'] * df_pnts['d_time_hr'+eca_tag] 
                                        # t per kwh * kw * hr --- tons fuel
            for tag in tags :
                df_pnts['Fmain'+eca_tag+tag] = df_pnts["alpha"+tag] * df_pnts["s3_dt"+eca_tag]
                df_pnts['F'    +eca_tag+tag] = df_pnts['Fmain'+eca_tag+tag] + df_pnts['Faux'+eca_tag]
        del eca_tags        
    
              
        # calculate emissions
        # need to account for main and aux engines seperately
        # this nest is for SO2 and PM
        pols = ['PM','SO2']
        for pol in pols :
            df_pnts['Eaux_'+pol+'_pre']  = df_pnts['Faux'] * df_EF_aux.loc['res',pol] 
            df_pnts['Eaux_'+pol+'_eca09']  =  df_pnts['Faux_out09'] * df_EF_aux.loc['res',pol] + df_pnts['Faux_eca09'] * df_EF_aux.loc['dist',pol]
            df_pnts['Eaux_'+pol+'_eca11']  = df_pnts['Faux_out11'] * df_EF_aux.loc['res',pol] + df_pnts['Faux_eca11'] * df_EF_aux.loc['dist',pol]
 
            for tag in tags :
                # pre-eca always using residual
                df_pnts['Emain_'+pol+'_pre'+tag]  = df_pnts['Fmain'+tag] * df_EF.loc['res',pol] 
                df_pnts['E_'+pol+'_pre'+tag]  = df_pnts['Emain_'+pol+'_pre'+tag] + df_pnts['Eaux_'+pol+'_pre']
                
                # obeying eca boundaries
                # these are quantities of emissions at each point if complied with specified boundaries
                df_pnts['Emain_'+pol+'_eca09'+tag]  = df_pnts['Fmain_out09'+tag] * df_EF.loc['res',pol] + df_pnts['Fmain_eca09'+tag] * df_EF.loc['dist',pol] 
                df_pnts['E_'+pol+'_eca09'+tag]  = df_pnts['Emain_'+pol+'_eca09'+tag] + df_pnts['Eaux_'+pol+'_eca09']
                
                
                df_pnts['Emain_'+pol+'_eca11'+tag] = df_pnts['Fmain_out11'+tag] * df_EF.loc['res',pol] + df_pnts['Fmain_eca11'+tag] * df_EF.loc['dist',pol] 
                df_pnts['E_'+pol+'_eca11'+tag]  = df_pnts['Emain_'+pol+'_eca11'+tag] + df_pnts['Eaux_'+pol+'_eca11'] 
                        
                        
                    
        # calculate NOx emissions
        # doesn't depend on fuel type
        #df_pnts = df_pnts.merge(df_EF_NOx, on='Tier' , how='left') # merge in efs
        for tag in tags:
            df_pnts['E_NOx'+tag]  = df_pnts['Fmain'+tag] * df_pnts['EF'] + df_pnts['Faux'] * df_pnts['EFaux']
        df_pnts = df_pnts.drop(['EF', 'EFaux'], axis=1)     
            
        # calculate VOC emissions
        # doesnt depend on engine or fuel type
        for tag in tags:
            df_pnts['E_VOC'+tag]  = ( df_pnts['Fmain'+tag]  + df_pnts['Faux'] ) * EF_VOC 
           
        # calculate damages ($), by pollutant and then total
        # this nest is for SO2 and PM
        cases = ['_pre','_eca09','_eca11']
        damages = ['' , '_inm'] # which damages to use - inmap or ap2
        for case in cases :
            for tag in tags :
                for d in damages :
                    for pol in pols : 
                        # aux calculations here don't depend on tag, so doing twice, but calculation is the same each time
                        
                        df_pnts['TD_'+pol+d+case+tag] = df_pnts['E_'+pol+case+tag] * df_pnts['D_'+pol+d] # total damage
                        df_pnts['TDmain_'+pol+d+case+tag] = df_pnts['Emain_'+pol+case+tag] * df_pnts['D_'+pol+d] # total damage
                        df_pnts['TDaux_'+pol+d+case] = df_pnts['Eaux_'+pol+case] * df_pnts['D_'+pol+d] # total damage 
                    
                    # aggregate damages across pollutants 
                    df_pnts['TD'+d+case+tag] =  df_pnts['TD_PM'+d+case+tag] + df_pnts['TD_SO2'+d+case+tag]   
                    df_pnts['TDmain'+d+case+tag] = df_pnts['TDmain_PM'+d+case+tag] + df_pnts['TDmain_SO2'+d+case+tag]  
                    df_pnts['TDaux'+d+case] = df_pnts['TDaux_PM'+d+case] + df_pnts['TDaux_SO2'+d+case]   
                    
                    # dropping emissions specific damages intermediate cases
                    df_pnts = df_pnts.drop(labels=['TDmain_PM'+d+case+tag,'TDmain_SO2'+d+case+tag],axis='columns')
                    df_pnts = df_pnts.drop(labels=['TDaux_PM'+d+case,'TDaux_SO2'+d+case],axis='columns')

                    
        # do NOx and VOC calculations
        for tag in tags:
            df_pnts['TD_NOx'+tag]  = df_pnts['E_NOx'+tag] * df_pnts['D_NOx_inm']     
            df_pnts['TD_VOC'+tag]  = df_pnts['E_VOC'+tag] * df_pnts['D_VOC_inm']     
            
                
        # calculating values for inside/outside ECA for distillate and residual fuel
        # using these to construct intermediate fuel cases
        eca_tags = ["_out09","_out11","_eca09","_eca11","_eca11and09","_eca11not09"]  
        fuel_tags = ["res","dist"]      
        for tag in ["_cons"] :      # only doing this for _cons to limit extra vars
            for fuel_tag in fuel_tags :
                for eca_tag in eca_tags :
                    df_pnts['E_SO2_'+fuel_tag+eca_tag+tag] = df_pnts['F'+eca_tag+tag] * df_EF.loc[fuel_tag,'SO2'] + df_pnts['Faux'+eca_tag] * df_EF_aux.loc[fuel_tag,'SO2'] 
                    df_pnts['E_PM_'+fuel_tag+eca_tag+tag] = df_pnts['F'+eca_tag+tag] * df_EF.loc[fuel_tag,'PM'] + df_pnts['Faux'+eca_tag] * df_EF_aux.loc[fuel_tag,'PM'] 
                    
                    df_pnts['TD_inm_'+fuel_tag+eca_tag+tag] = df_pnts['E_SO2_'+fuel_tag+eca_tag+tag] * df_pnts['D_SO2_inm']  \
                                                            + df_pnts['E_PM_'+fuel_tag+eca_tag+tag] * df_pnts['D_PM_inm'] 
    
        del eca_tags
    
        # getting list of variables
        pnts_vars2 = list( df_pnts.columns.values ) 
                     
        # getting vars with specific tags to sum
        sum_vars  = [] 
        for v in pnts_vars2 :
            #if any(ext in v for ext in ["TD_","F_","Fmain_","Faux","E_"] ):  # ,"s3_"
            if v.startswith( ("TD_","TDmain","TDaux","F_","Fmain_","Faux","E_","Emain_","Eaux_") ) :
                sum_vars.append(v)
        #sum_vars = sum_vars + ['km_onland']
        
        drop_vars = ['pnt_ID','GRID_ID','elsegundo','rosarito','ensenada','date_ds_apeep','ID_pnt','pnt_ID','X','Y', \
                     'kmh','km_cum','in_eca09','in_eca11','d_km','d_time_hr' \
                     , 's3_dt' , 's3_dt_eca09' , 's3_dt_eca11' \
                     ,'s3_dt_out09','s3_dt_out11','s3_dt_shr_eca09','s3_dt_shr_eca11','s3_dt_shr_eca11and09','s3_dt_shr_eca11not09'\
                    ,'d_time_hr_eca09'  , 'd_time_hr_eca11','d_time_hr_out09','d_time_hr_out11','d_time_hr_out11and09','d_time_hr_out11not09' \
                    , 'kmpop_us','kmpop_ca','kmpop_la','kmpop_lasf','distUScoast_km'\
                    ,'D_PM','D_SO2','D_PM_inm','D_SO2_inm','D_NOx_inm','D_VOC_inm','D_NH3_inm','onland','km_onland']
        first_vars = [x for x in pnts_vars2 if x not in sum_vars + drop_vars  + ves_fuel_chars ]
    
        # saving points df for output
        # dropping imo_m column from points
        df_pnts_list.append(df_pnts.drop(columns='imo_m',errors='ignore'))   
    
    
        # collapse to track 
        # min_count=1 forces calculation to occur only if 1 or more obs
        df_track_y = df_pnts[ sum_vars + ['OID'] ].groupby('OID').sum(min_count=1)
        df_track_first = df_pnts[ first_vars ].groupby('OID').first()
        df_track_min = df_pnts[['OID','kmh'] ].groupby('OID').min()
        df_track_min = df_track_min.rename( columns={ 'kmh':'kmh_min' } )
        df_track_max = df_pnts[['OID','kmh','elsegundo','rosarito','ensenada']].groupby('OID').max()
        df_track_max = df_track_max.rename( columns={ 'kmh':'kmh_max' } )
        df_track_y = df_track_y.merge(df_track_first,on="OID",how="left") 
        df_track_y = df_track_y.merge(df_track_min,on="OID",how="left") 
        df_track_y = df_track_y.merge(df_track_max,on="OID",how="left") 
        del df_track_min
        del df_track_max 
        del df_track_first   
        
        
        # resetting index, to get OID back as column
        df_track_y.reset_index(level=0, inplace=True)
        #a = df_track_y[['OID','TD_inm_pre_cons','TDaux_inm_pre_cons']]
        #a = df_track_y.columns
    
        # adding other track level outcomes calculated in the joining marginal damage step
        df_raw_tracks_y =  pd.read_csv( tracks_csv % year , index_col=False )
        df_track_y = df_track_y.merge(df_raw_tracks_y[['OID','km_onland','km_studyarea','km_naeca','naeca_interp'\
                                                       ,'km_interp_naeca','km_interp_naeca_less160','km_interp_naeca_more160','kmh_cruise']],on="OID",how="left")
      
    
        # calculating total emissions damage per ton
        cases = ['_pre','_eca09','_eca11'] # compliance cases
        for case in cases :
            for tag in tags :
                for d in damages :
                    df_track_y['TDtonne'+d+case+tag] = df_track_y['TD'+d+case+tag] / df_track_y['F'+tag] # damage per ton fuel
            
        for tag in tags :
            df_track_y['TDtonne_NOx'+tag] = df_track_y['TD_NOx'+tag] / df_track_y['F'+tag] # damage per ton fuel
            df_track_y['TDtonne_VOC'+tag] = df_track_y['TD_VOC'+tag] / df_track_y['F'+tag] # damage per ton fuel
        
        
        # Flagging tracks that were used in interpolated tracks   
        # first creating a track ID for the interpolated tracks
        # then assigning this track ID to all tracks used in interpolation
           
        track_id_vars = ['MMSI','INTERP_TRACKS','INTERP_FLAG'] # these identify the interpolated tracks
        df_tmp = df_track_y.loc[df_track_y.INTERP_FLAG==1,track_id_vars].drop_duplicates()
        df_tmp["TRACK_ID_INTERP"] = df_tmp.groupby(track_id_vars, as_index=True).ngroup() + df_track_y.TRACK_ID.max() + 1 
        df_track_y = df_track_y.merge ( df_tmp  , on=track_id_vars , how='left')
        df_track_y.TRACK_ID_INTERP.fillna(df_track_y.TRACK_ID, inplace=True)
        #a = df_track_y[['TRACK_ID_INTERP','INTERP_FLAG']]
        df_track_y.TRACK_ID = df_track_y.TRACK_ID_INTERP
        df_track_y.drop(columns='TRACK_ID_INTERP',inplace=True)
    
        # getting frame with interpolated tracks and the TRACK_IDs that were used in interpolation    
        # then mergining in (so we can determine which tracks were used in interpolation)
        df_tmp = df_track_y.loc[df_track_y.INTERP_FLAG==1,['TRACK_ID','INTERP_TRACKS']].drop_duplicates()  # .str.get_dummies(sep=',')
        df_tmp = df_tmp.set_index(['TRACK_ID']).apply(lambda x: x.str.split(',').explode()).reset_index() # this splits interpolated track_ids and stacks
        df_tmp.INTERP_TRACKS = df_tmp.INTERP_TRACKS.astype(int)
        df_tmp = df_tmp.rename(columns={'TRACK_ID':'INTERP_TRACK_ID','INTERP_TRACKS':'TRACK_ID'})
        df_track_y = df_track_y.merge(df_tmp,on='TRACK_ID',how='left')
        df_track_y.INTERP_TRACK_ID.fillna(0,inplace=True)
        del df_tmp
        

        # storing tracks
        df_list.append(df_track_y)
                      
        del df_track_y
    
    del df_ves_char_fuel
    
    print("Merging Tracks")
    df_track = pd.concat(df_list,axis=0,ignore_index=True,sort=True)
    del df_list 
    
    # calculating hours
    df_track['hours_py'] = ( pd.to_datetime( df_track['TIME2'] ) - pd.to_datetime( df_track['TIME1'] ) )
    df_track['hours_py'] = df_track.hours_py.dt.total_seconds()/3600
    
    
    print("Load and Merge Speed Bins Files")
    df_bins_list = []
    for year in years :
        print('Loading and Cleaning {}'.format(year))
        # load down sampled track points    
        df_bins_y =  pd.read_csv( speed_bins_csv % year , index_col=False )
        
        df_bins_y['AISyear'] = year
        df_bins_y = df_bins_y.replace(-9999,np.nan) # replace missing values as nan
        df_bins_y['IMO'] = df_bins_y.IMO.replace(0,np.nan) # replace IMO
        
        # storing original IMO
        df_bins_y['imo_orig'] = df_bins_y.IMO   
        
        if year in [2010,2011,2012,2013,2014]:
            # merge in im mmsi map
            df_bins_y = df_bins_y.merge(df_oid_imo,on=['OID','AISyear'],how='left')
            df_bins_y['IMO'] = df_bins_y.imo_m 
            #df_pnts = df_pnts.drop(columns='IMO')
            #df_pnts = df_pnts.rename( columns={'imo_m':'IMO'} )
    
        # storing
        df_bins_list.append(df_bins_y)
        del df_bins_y
     
    del df_oid_imo    
        
    # concatenate
    df_km_bins = pd.concat(df_bins_list,axis=0,ignore_index=True,sort=True)
    df_km_bins = df_km_bins.drop(columns=['km_onland','MMSI','INTERP_FLAG']) # removing some that get added with track data
    
    # adding characteristics and cleaning
    df_track = final_track_functions.add_vars_clean(df_track,df_ves_char,ports_df,mid2cntry_df)
    del df_ves_char
    
    # renaming TRACK_ID (which is full unbroken tracks)
    df_track = df_track.rename(columns={'TRACK_ID':'FULL_TRACK_ID'})
    
    # adding track identifier
    df_track['track_id'] = np.arange(len(df_track))
    
    # flagging interpolated tracks that are not to AK/HI and AK/HI tracks that were not interp
    # using this to remove duplicates that are never used
    AK_HI_IND = ["AK","HI","Unimak"]
    long_ind = ( ( (df_track.PORT1<=12) & (df_track.port_agg2.isin(AK_HI_IND) ) ) | (( df_track.PORT2<=12 ) & (df_track.port_agg1.isin(AK_HI_IND) ) ) )
    df_track['drop_ind'] = ((long_ind==False) & (df_track.INTERP_FLAG==1)) | ( (long_ind==True) & (df_track.INTERP_FLAG==0) )
    
    # adding values for interpolated tracks
    # get frame with only interpolated tracks that reach west coast and go to AK or HI
    row_ind = ( (df_track.INTERP_FLAG==1) & (long_ind==True) )
    df_interp_wc = df_track.loc[ row_ind, ['track_id','AISyear','FULL_TRACK_ID','route_name','TIME1','TIME2'] ]
    df_interp_wc = df_interp_wc.rename(columns={'track_id':'track_id_interp','route_name':'route_name_interp','FULL_TRACK_ID':'INTERP_TRACK_ID'})
    del long_ind
    del row_ind
    
    # merge to tracks (first with TIME1, then TIME2, keep only matches)
    df_track2 = df_track[['track_id','AISyear','INTERP_TRACK_ID','TIME1','TIME2']].merge(df_interp_wc[['AISyear','INTERP_TRACK_ID','TIME1','route_name_interp','track_id_interp']],on=['AISyear','INTERP_TRACK_ID','TIME1'],how='left')
    df_track2 = df_track2.merge(df_interp_wc[['AISyear','INTERP_TRACK_ID','TIME2','route_name_interp','track_id_interp']]\
                                ,on=['AISyear','INTERP_TRACK_ID','TIME2'],how='left',suffixes=('_1','_2'))
    df_track2['route_name_interp'] = df_track2['route_name_interp_1']
    df_track2['route_name_interp'].fillna(df_track2['route_name_interp_2'],inplace=True)
    df_track2['track_id_interp'] = df_track2['track_id_interp_1']
    df_track2['track_id_interp'].fillna(df_track2['track_id_interp_2'],inplace=True)
    
    # merging to full table
    df_track = df_track.merge(df_track2[['track_id','track_id_interp','route_name_interp']],on='track_id',how='left')
    del df_track2 
    del df_interp_wc
    
    # converting time stamps to times
    df_track['TIME1'] =  pd.to_datetime(df_track['TIME1'])
    df_track['TIME2'] =  pd.to_datetime(df_track['TIME2'])
    
    ## Merging Entrance/Clearance Origin Destinations
    df_track['EnterExit'] = np.where( ( (df_track.PORT1<=12) & (df_track.PORT2>100) ) | ( (df_track.PORT2<=12) & (df_track.PORT1 >100) ) , 1  , 0 ) 
       
    # getting just enter/exits and matching to EC
    df_enter_exits = df_track.loc[df_track.EnterExit==True,:].copy()
    df_ECmatch=merge_EC_full.EC_merge(df_enter_exits , EC_dta , agg_region )
    del df_enter_exits    
    
    df_track = df_track.merge(df_ECmatch,on='track_id',how='left')
    df_track['ECmatch'] = df_track.port_name1_ECport.notna()
    del df_ECmatch
    
    # variables to include in speed bin and points files
    track_vars_to_merge = ['km_onland','route_name','flag_country_mid','dirNW','built', 'dwt' , \
                           'loa' , 'draft' , 'beam' , 'power_kw','Power Type','gt','MMSI',\
                           'track_id','FULL_TRACK_ID','INTERP_FLAG','port_agg1','port_agg2','kmh_max','merged_chars',\
                           'track_id_interp','route_name_interp','Main Engine Fuel Type','group','group_agg','drop_ind',\
                           'elsegundo' , 'rosarito' , 'ensenada','port_name1','port_name2','hours_py','port_name1_ECport','port_name2_ECport',\
                           'port_name1_ECcountry','port_name2_ECcountry','port_name1_ECregion','port_name2_ECregion' , 'Owner_Group' , 'Operator_Group' ,'EngineDispl_cul'  ] 
    
        
    # add vars from track df to bin df
    df_km_bins = df_km_bins.merge(df_track[['OID','AISyear']+track_vars_to_merge],on=['OID','AISyear'],how="left")
    df_km_bins = df_km_bins.loc[df_km_bins.drop_ind==False] # dropping unused interpolated tracks
    df_km_bins.drop(columns=['drop_ind' ] , inplace=True )  
    
    # collapsing speed bins then adding to tracks table
    # Maersk Definitions
    # below 24 kmh - very slow
    # below 30 kmh - super slow
    # below 34 kmh - slow 
    # above 40 kmh - fast
    df_km_bins['below12'] = np.where(df_km_bins['kmh_b']<12, 1, 0)  
    df_km_bins['below16'] = np.where(df_km_bins['kmh_b']<16, 1, 0)  
    df_km_bins['below24'] = np.where(df_km_bins['kmh_b']<24, 1, 0)  
    df_km_bins['below30'] = np.where(df_km_bins['kmh_b']<30, 1, 0)  
    df_km_bins['below34'] = np.where(df_km_bins['kmh_b']<34, 1, 0)  
    df_km_bins['above40'] = np.where(df_km_bins['kmh_b']>=40, 1, 0) 
    
    var_list  = ['below12','below16','below24','below30','below34','above40']
    #df_km_bins_tot = df_km_bins[ ['DIST','DIST_ECA2009','DIST_ECA2011','OID' ] ].groupby(['OID'],as_index=False).sum()
    
    
    for v in var_list :
        df_km_bins_tmp = df_km_bins[df_km_bins[v]==1]
        df_km_bins_v = df_km_bins_tmp[ ['d_km','d_km_eca09','d_km_eca11','OID' ,'AISyear'] ].groupby(['OID','AISyear'],as_index=False).sum()
        df_km_bins_v.rename(columns = {"d_km": "d_km_"+v, "d_km_eca09": "d_km_eca09_"+v, "d_km_eca11" : "d_km_eca11_"+v }, inplace=True)
        
        df_track = df_track.merge(df_km_bins_v,on=['OID','AISyear'],how="left") # merge
    
    ##### adding entrance/exits with different boundaries
    print('Adding Entrance/Exit Values with different bounds')
    len(df_track.columns)
    df_track = df_track.merge(df_nobound,how='left',on=['AISyear','OID'])
    len(df_track.columns)
    
    ## saving to csv
    ### removing tracks we don't use
    df_track_out = df_track.copy() # duplicating so we can use original below
    df_track_out = df_track_out.loc[df_track_out.drop_ind==False] # dropping unused interpolated tracks
    df_track_out.drop(columns=['drop_ind' ] , inplace=True )  
    
    tracks_to_keep = ( (df_track_out.PORT1.notnull()) & (df_track_out.PORT2.notnull()) & (df_track_out.PORT1!=df_track_out.PORT2) ) &  ~( (((df_track_out.PORT1<13) & (df_track_out.RIGHT1==1)) | ((df_track_out.PORT2<13) & (df_track_out.RIGHT2==1))) )
    #df_track_out.columns = df_track_out.columns.str.replace(' ','') # removing whitespace in names
    df_track_out.columns = df_track_out.columns.str.replace('\W', '')
    # saving stata
    #df_track_out[tracks_to_keep].to_stata(out_dta,convert_dates={'TIME1':'td' , 'TIME2':'td'},write_index=False,version=117)
    #print("Saved File: {}".format(out_dta))
    
    ### SAVING
    # saving csv
    df_track_out[tracks_to_keep].to_csv(out_csv,sep=',',index=False,header=True)
    print("Saved File: {}".format(out_csv))

    # saving other tracks (these are those within port or outside study area)
    df_track_out[~tracks_to_keep].to_csv(out_other_csv,sep=',',index=False,header=True)
    print("Saved File: {}".format(out_other_csv))
    
    #print( df_track.groupby('AISyear')[['has_imo','has_imo_orig','has_imo_matched']].mean() )
    

    print("Merging and Saving Points File")
    # these are variables to keep from points file
    # track level outcomes get merged later
    vars_to_keep = ['AISyear','OID','GRID_ID','date_ds_apeep','kmpop_us','kmpop_ca','kmpop_la','kmpop_lasf','distUScoast_km','D_PM','D_SO2','D_PM_inm','D_SO2_inm' \
                    ,'D_NOx_inm','D_VOC_inm','PORT1','PORT2','PORT_ID','TIME1','TIME2' \
                    ,'in_eca09','in_eca11','in_naeca','interp','kmh','km_cum','d_km','F_cons','IMO' ]
        
    vars_to_keep_grid = ['AISyear','OID','GRID_ID','PORT1','PORT2','PORT_ID','TIME1','TIME2','Power Type', \
                         'group_agg','IMO','kmh','d_km','F_cons']
         
    # loop through frames and select just tracks and variables we need
    df_pnts_forgrid_list = [] 
    df_pnts_naeca_list = []  
    for i, frame in enumerate(df_pnts_list):
        print("i: {}".format(i))
        track_to_keep_grid = ~( (((frame.PORT1<13) & (frame.RIGHT1==1)) | ((frame.PORT2<13) & (frame.RIGHT2==1))) ) # dropping internal only
        df_pnts_forgrid_list.append( frame.loc[track_to_keep_grid,vars_to_keep_grid] )
        
        track_to_keep_naeca = (frame.naeca_cross==1) # keeping only those that cross NA ECA
        df_pnts_naeca_list.append( frame.loc[track_to_keep_naeca,vars_to_keep] )
        
        tracks_to_keep = ( (frame.PORT1.notnull()) & (frame.PORT2.notnull()) & \
                        (frame.PORT1!=frame.PORT2) ) &  ~( (((frame.PORT1<13) & (frame.RIGHT1==1)) | ((frame.PORT2<13) & (frame.RIGHT2==1))) )
        df_pnts_list[i] = frame.loc[tracks_to_keep,vars_to_keep]
    
    df_pnts_all = pd.concat(df_pnts_list,axis=0,ignore_index=True,sort=True)
    del df_pnts_list  
    
    df_pnts_forgrid = pd.concat(df_pnts_forgrid_list,axis=0,ignore_index=True,sort=True)
    del df_pnts_forgrid_list  
    
    df_pnts_naeca = pd.concat(df_pnts_naeca_list,axis=0,ignore_index=True,sort=True)
    del df_pnts_naeca_list  
    
    # clean points for grids
    df_pnts_forgrid = df_pnts_forgrid.merge(df_track[['OID','AISyear']+track_vars_to_merge],on=['OID','AISyear'],how="left")
    df_pnts_forgrid = df_pnts_forgrid.loc[df_pnts_forgrid.INTERP_FLAG==False] # dropping all interpolated tracks    
    
    # save by year
    grouped = df_pnts_forgrid.groupby('AISyear')
    for name, group in grouped:
        group.to_csv(out_csv_points_grid % name ,sep=',',index=False,header=True)

    # saving points stata and csv
    final_track_functions.save_pnts(df_pnts_all,df_track,track_vars_to_merge,out_dta_points,out_csv_points ) # for bins analysis
    final_track_functions.save_pnts(df_pnts_naeca,df_track,track_vars_to_merge,out_dta_pnts_naeca,out_csv_pnts_naeca ) # for NA ECA boundary analysis

    # # OLD - saving as single files  %%%%%%%%%%%%%%%%%%%%%%%%  
    # # concat points files
   
    # # saving points stata and csv
    # final_track_functions.save_pnts(df_pnts_all,df_track,track_vars_to_merge,out_dta_points,out_csv_points ) # for bins analysis
    # final_track_functions.save_pnts(df_pnts_naeca,df_track,track_vars_to_merge,out_dta_pnts_naeca,out_csv_pnts_naeca ) # for NA ECA boundary analysis
    
    
    # # save points with GRID_ID to csv
    # #df_pnts_forgrid.drop(columns=track_vars_to_merge,inplace=True)
    # df_pnts_forgrid.to_csv(out_csv_points_grid,sep=',',index=False,header=True)
    # print("Saved File: {}".format(out_csv_points_grid))
    # ###########################
    
    
    print("Saving Speed Bin File")
    tracks_to_keep = ( (df_km_bins.PORT1.notnull()) & (df_km_bins.PORT2.notnull()) & (df_km_bins.PORT1!=df_km_bins.PORT2) ) &  ~( (((df_km_bins.PORT1<13) & (df_km_bins.RIGHT1==1)) | ((df_km_bins.PORT2<13) & (df_km_bins.RIGHT2==1))) )
    
    df_km_bins.columns = df_km_bins.columns.str.replace(' ', '') # removing whitespace in names
    df_km_bins.columns = df_km_bins.columns.str.lower() # lowercase everything
    
    #str_cols = ['group','group_agg','mainenginefueltype','route_name_interp','route_name','flag_country_mid','trackstarttime','port_id','interp_tracks']
    
    df_km_bins['time1'] =  pd.to_datetime(df_km_bins['time1'])
    df_km_bins['time2'] =  pd.to_datetime(df_km_bins['time2'])
    #df_km_bins[tracks_to_keep].to_stata(out_bins_dta,convert_dates={'time1':'td' , 'time2':'td'},write_index=False,version=117,convert_strl=df_km_bins.columns[df_km_bins.isnull().all() & df_km_bins.dtypes==object].tolist())
    #print("Saved File: {}".format(out_bins_dta))
    
    df_km_bins[tracks_to_keep].to_csv(out_bins_csv,sep=',',index=False,header=True)
    print("Saved File: {}".format(out_bins_csv))
    


if __name__ == "__main__" :
    main() 